{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The MIT License (MIT)\n",
    "\n",
    "# Copyright (c) 2020, NVIDIA CORPORATION.\n",
    "\n",
    "# Permission is hereby granted, free of charge, to any person obtaining a copy of\n",
    "# this software and associated documentation files (the \"Software\"), to deal in\n",
    "# the Software without restriction, including without limitation the rights to\n",
    "# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of\n",
    "# the Software, and to permit persons to whom the Software is furnished to do so,\n",
    "# subject to the following conditions:\n",
    "\n",
    "# The above copyright notice and this permission notice shall be included in all\n",
    "# copies or substantial portions of the Software.\n",
    "\n",
    "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
    "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS\n",
    "# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR\n",
    "# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER\n",
    "# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n",
    "# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial: Feature Engineering for Recommender Systems\n",
    "\n",
    "# 3. Feature Engineering - Categorical\n",
    "\n",
    "## 3.2. Categorify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import IPython\n",
    "\n",
    "import pandas as pd\n",
    "import cudf\n",
    "import numpy as np\n",
    "import cupy\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_train = cudf.read_parquet('./data/train.parquet')\n",
    "df_valid = cudf.read_parquet('./data/valid.parquet')\n",
    "df_test = cudf.read_parquet('./data/test.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_time</th>\n",
       "      <th>event_type</th>\n",
       "      <th>product_id</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_session</th>\n",
       "      <th>target</th>\n",
       "      <th>cat_0</th>\n",
       "      <th>cat_1</th>\n",
       "      <th>cat_2</th>\n",
       "      <th>cat_3</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>ts_hour</th>\n",
       "      <th>ts_minute</th>\n",
       "      <th>ts_weekday</th>\n",
       "      <th>ts_day</th>\n",
       "      <th>ts_month</th>\n",
       "      <th>ts_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-12-01 00:00:28 UTC</td>\n",
       "      <td>cart</td>\n",
       "      <td>17800342</td>\n",
       "      <td>zeta</td>\n",
       "      <td>66.90</td>\n",
       "      <td>550465671</td>\n",
       "      <td>22650a62-2d9c-4151-9f41-2674ec6d32d5</td>\n",
       "      <td>0</td>\n",
       "      <td>computers</td>\n",
       "      <td>desktop</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2019-12-01 00:00:28</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-12-01 00:00:39 UTC</td>\n",
       "      <td>cart</td>\n",
       "      <td>3701309</td>\n",
       "      <td>polaris</td>\n",
       "      <td>89.32</td>\n",
       "      <td>543733099</td>\n",
       "      <td>a65116f4-ac53-4a41-ad68-6606788e674c</td>\n",
       "      <td>0</td>\n",
       "      <td>appliances</td>\n",
       "      <td>environment</td>\n",
       "      <td>vacuum</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2019-12-01 00:00:39</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2019-12-01 00:00:40 UTC</td>\n",
       "      <td>cart</td>\n",
       "      <td>3701309</td>\n",
       "      <td>polaris</td>\n",
       "      <td>89.32</td>\n",
       "      <td>543733099</td>\n",
       "      <td>a65116f4-ac53-4a41-ad68-6606788e674c</td>\n",
       "      <td>0</td>\n",
       "      <td>appliances</td>\n",
       "      <td>environment</td>\n",
       "      <td>vacuum</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2019-12-01 00:00:40</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-12-01 00:00:41 UTC</td>\n",
       "      <td>cart</td>\n",
       "      <td>3701309</td>\n",
       "      <td>polaris</td>\n",
       "      <td>89.32</td>\n",
       "      <td>543733099</td>\n",
       "      <td>a65116f4-ac53-4a41-ad68-6606788e674c</td>\n",
       "      <td>0</td>\n",
       "      <td>appliances</td>\n",
       "      <td>environment</td>\n",
       "      <td>vacuum</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2019-12-01 00:00:41</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2019-12-01 00:01:56 UTC</td>\n",
       "      <td>cart</td>\n",
       "      <td>1004767</td>\n",
       "      <td>samsung</td>\n",
       "      <td>235.60</td>\n",
       "      <td>579970209</td>\n",
       "      <td>c6946211-ce70-4228-95ce-fd7fccdde63c</td>\n",
       "      <td>0</td>\n",
       "      <td>construction</td>\n",
       "      <td>tools</td>\n",
       "      <td>light</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>2019-12-01 00:01:56</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                event_time event_type  product_id    brand   price    user_id  \\\n",
       "0  2019-12-01 00:00:28 UTC       cart    17800342     zeta   66.90  550465671   \n",
       "1  2019-12-01 00:00:39 UTC       cart     3701309  polaris   89.32  543733099   \n",
       "2  2019-12-01 00:00:40 UTC       cart     3701309  polaris   89.32  543733099   \n",
       "3  2019-12-01 00:00:41 UTC       cart     3701309  polaris   89.32  543733099   \n",
       "4  2019-12-01 00:01:56 UTC       cart     1004767  samsung  235.60  579970209   \n",
       "\n",
       "                           user_session  target         cat_0        cat_1  \\\n",
       "0  22650a62-2d9c-4151-9f41-2674ec6d32d5       0     computers      desktop   \n",
       "1  a65116f4-ac53-4a41-ad68-6606788e674c       0    appliances  environment   \n",
       "2  a65116f4-ac53-4a41-ad68-6606788e674c       0    appliances  environment   \n",
       "3  a65116f4-ac53-4a41-ad68-6606788e674c       0    appliances  environment   \n",
       "4  c6946211-ce70-4228-95ce-fd7fccdde63c       0  construction        tools   \n",
       "\n",
       "    cat_2 cat_3            timestamp  ts_hour  ts_minute  ts_weekday  ts_day  \\\n",
       "0    <NA>  <NA>  2019-12-01 00:00:28        0          0           6       1   \n",
       "1  vacuum  <NA>  2019-12-01 00:00:39        0          0           6       1   \n",
       "2  vacuum  <NA>  2019-12-01 00:00:40        0          0           6       1   \n",
       "3  vacuum  <NA>  2019-12-01 00:00:41        0          0           6       1   \n",
       "4   light  <NA>  2019-12-01 00:01:56        0          1           6       1   \n",
       "\n",
       "   ts_month  ts_year  \n",
       "0        12     2019  \n",
       "1        12     2019  \n",
       "2        12     2019  \n",
       "3        12     2019  \n",
       "4        12     2019  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = 'product_id'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Theory"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<b>*Categorifying*</b> is required for using categorical features in deep learning models with Embedding layers. An Embedding layer encodes the category into a hidden latent vector with a smaller dimension.<br><br>\n",
    "Categorical features can be from datatype String or Integer. The Embedding layer requires that categorical features are continoues, positive Integers from 0 to |C| (number of unique category values)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are 164453 unique product values but the ProductIDs range from 1000894 to 100144608."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           1000894\n",
       "1           1000978\n",
       "2           1001588\n",
       "3           1001605\n",
       "4           1001606\n",
       "            ...    \n",
       "164448    100143856\n",
       "164449    100143867\n",
       "164450    100144046\n",
       "164451    100144443\n",
       "164452    100144608\n",
       "Name: product_id, Length: 164453, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train[cat].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using factorize creates continous Integers from a categorical fateature."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes, uniques = df_train[cat].factorize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           65426\n",
       "1           10158\n",
       "2           10158\n",
       "3           10158\n",
       "4             775\n",
       "            ...  \n",
       "11461352      862\n",
       "11461353     1064\n",
       "11461354      775\n",
       "11461355    10158\n",
       "11461356    91487\n",
       "Length: 11461357, dtype: int32"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0              0\n",
       "1              1\n",
       "2              2\n",
       "3              3\n",
       "4              4\n",
       "           ...  \n",
       "164448    164448\n",
       "164449    164449\n",
       "164450    164450\n",
       "164451    164451\n",
       "164452    164452\n",
       "Length: 164453, dtype: int32"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "codes.unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another important reason to Categorify categorical features is to reduce the size of the dataset. Often categorical features are of the datatype String and sometimes, they are hashed to protect the user / dataset privacy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import hashlib\n",
    "from sys import getsizeof"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example, we can hash the Integer 0 to a md5 hash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'cfcd208495d565ef66e7dff9f98764da'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hashlib.md5(b'0').hexdigest()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can hash the full product_id column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "hashSeries = df_train[cat].to_pandas().apply(lambda x: hashlib.md5(bytes(str(x), encoding='utf-8')).hexdigest())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           5ebc4b45850c48658af86229318ccbea\n",
       "1           4b9dde859aa2809cc367fc44aa05eb4a\n",
       "2           4b9dde859aa2809cc367fc44aa05eb4a\n",
       "3           4b9dde859aa2809cc367fc44aa05eb4a\n",
       "4           e5e26a76d8aee9c4f8b4cd9cb8633577\n",
       "                          ...               \n",
       "11461352    4202ee67e0c3f9b1ebcfb622d9974e07\n",
       "11461353    5a06406ed78aab3c94bfefcdeb528eaf\n",
       "11461354    e5e26a76d8aee9c4f8b4cd9cb8633577\n",
       "11461355    4b9dde859aa2809cc367fc44aa05eb4a\n",
       "11461356    388318441a4807e254acf9c3f207969d\n",
       "Name: product_id, Length: 11461357, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hashSeries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1020060933"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "getsizeof(hashSeries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes, uniques = hashSeries.factorize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "91691016"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "getsizeof(pd.DataFrame(codes)[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We require only 9% of the original DataSeries memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08988778320363339"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "91691016/1020060933"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, we can prevent overfitting for low frequency categories. Categories with low frequency can be grouped together to an new category called 'other'. In the previous exercise we learned that it is powerful to combine categorical features together to create a new feature. However, combining categories increases the cardinality of the new feature and the number of obersations per category will decrease. Therefore, we can apply a treshhold to group all categories with lower frequency count to the the new category.<br><br>\n",
    "In addition, categories, which occure in the validation dataset and do not occur in the trainint dataset, should be mapped to the 'other' category as well.<br><br>\n",
    "We use in our example the categoryIds 0 or 1 for a placeholder for the low frequency and unkown category. Then our function is independent of the cardinality of the categorical feature and we do not keep records of the cardinality to know the low frequency/unkown category."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In our dataset, we see that multiple product_ids occure only once in the training dataset. Our model would overfit to these low frequent categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1004767      317711\n",
       "1005115      251189\n",
       "1004856      227432\n",
       "4804056      224545\n",
       "1005100      180072\n",
       "              ...  \n",
       "100143590         1\n",
       "100143856         1\n",
       "100143867         1\n",
       "100144046         1\n",
       "100144443         1\n",
       "Name: product_id, Length: 164453, dtype: int32"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train[cat].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "freq = df_train[cat].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "freq = freq.reset_index()\n",
    "freq.columns = [cat, 'count']\n",
    "freq = freq.reset_index()\n",
    "freq.columns = [cat + '_Categorify', cat, 'count']\n",
    "freq_filtered = freq[freq['count']>5]\n",
    "freq_filtered[cat + '_Categorify'] = freq_filtered[cat + '_Categorify']+1\n",
    "freq_filtered = freq_filtered.drop('count', axis=1)\n",
    "df_train = df_train.merge(freq_filtered, how='left', on=cat)\n",
    "df_train[cat + '_Categorify'] = df_train[cat + '_Categorify'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 76404, (76405,))"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train['product_id_Categorify'].min(), df_train['product_id_Categorify'].max(), df_train['product_id_Categorify'].drop_duplicates().shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need to apply the categorify to our validation and test sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_valid = df_valid.merge(freq_filtered, how='left', on=cat)\n",
    "df_valid[cat + '_Categorify'] = df_valid[cat + '_Categorify'].fillna(0)\n",
    "\n",
    "df_test = df_test.merge(freq_filtered, how='left', on=cat)\n",
    "df_test[cat + '_Categorify'] = df_test[cat + '_Categorify'].fillna(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<li> Categorify is important to enable deep learning models to use categorical features<br>\n",
    "<li> Categorify can significantly reduce the dataset size by tranforming categorical features from String datatypes to Integer datatypes<br>\n",
    "<li> Categorify can prevent overfitting by grouping categories with low frequency into one category together"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Practice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, it is your turn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**ToDo:**\n",
    "<li> Categorify the category features brand\n",
    "<li> Apply a frequency treshhold of minimum 20\n",
    "<li> Map low frequency categories to the id=0\n",
    "<li> Map unkown categories to the id=1 in the validation and test set\n",
    "\n",
    "    \n",
    "**Question:**\n",
    "<li> How many data points have an unknown category in the test dataset?\n",
    "<li> How many data points have a low frequency category in the test dataset?\n",
    "<li> How many data points have a low frequency category in the training dataset?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "### ToDo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "############### Solution ###############"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "############### Solution End ###########"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Optimization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare the runtime between pandas and cuDF. The implementation depends only on the DataFrame object (calling function of the object) and does not require any pd / cuDF function. Therefore, we can use the same implementation and just use pandas.DataFrame and cuDF.DataFrame. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def categorify(df_train, df_valid, df_test, cat, freq_treshhold=20, unkown_id=1, lowfrequency_id=0):\n",
    "    freq = df_train[cat].value_counts()\n",
    "    freq = freq.reset_index()\n",
    "    freq.columns = [cat, 'count']\n",
    "    freq = freq.reset_index()\n",
    "    freq.columns = [cat + '_Categorify', cat, 'count']\n",
    "    freq[cat + '_Categorify'] = freq[cat + '_Categorify']+2\n",
    "    freq.loc[freq['count']<freq_treshhold, cat + '_Categorify'] = lowfrequency_id\n",
    "    \n",
    "    freq = freq.drop('count', axis=1)\n",
    "    df_train = df_train.merge(freq, how='left', on=cat)\n",
    "    df_train[cat + '_Categorify'] = df_train[cat + '_Categorify'].fillna(unkown_id)\n",
    "    \n",
    "    df_valid = df_valid.merge(freq, how='left', on=cat)\n",
    "    df_valid[cat + '_Categorify'] = df_valid[cat + '_Categorify'].fillna(unkown_id)\n",
    "    \n",
    "    df_test = df_test.merge(freq, how='left', on=cat)\n",
    "    df_test[cat + '_Categorify'] = df_test[cat + '_Categorify'].fillna(unkown_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_pd = df_train.to_pandas()\n",
    "df_valid_pd = df_valid.to_pandas()\n",
    "df_test_pd = df_test.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15.5 s, sys: 5.77 s, total: 21.3 s\n",
      "Wall time: 21.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "categorify(df_train_pd, df_valid_pd, df_test_pd, 'user_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 168 ms, sys: 296 ms, total: 464 ms\n",
      "Wall time: 463 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "categorify(df_train, df_valid, df_test, 'user_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In our experiments, running the same implementation is 63x times faster with cuDF instead of pandas."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We shutdown the kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'status': 'ok', 'restart': False}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "app = IPython.Application.instance()\n",
    "app.kernel.do_shutdown(False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
